# Kernel: sgemm_nn_128x64

# Copyright 2014 Nervana Systems Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#    http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


// Symbolic names for the immediates and constant-bank slots used by this kernel.
//  - addr_zero is used by the setup code as the shared-memory address that
//    receives a 16-byte store of RZ and is then read back to clear registers.
//    Its size expression (128*8*2 + 64*8*2) is twice the 64*8 + 128*8 span
//    used for swapBuf, so it presumably sits just past the two staging
//    buffers; the "4x" prefix looks like an element-to-byte scale -- TODO confirm.
//  - param_ldb8 is shifted right by 5 to obtain ldb, and is also added
//    unscaled to the B pointer after each tile is stored, so it presumably
//    holds the byte stride of 8 rows of B (ldb * 8 * 4) -- TODO confirm with caller.
//  - Names that are not referenced in this file (gridDimA, gridDimB, param_Rand,
//    param_C, param_ldc, param_alpha, param_beta, param_flags, param_ldcz,
//    param_loops) are presumably consumed by the included common code.
<CONSTANT_MAPPING>
    addr_zero  : 4x<128*8*2 + 64*8*2 + 0>

    gridDimA : c[0x0][0x14]
    gridDimB : c[0x0][0x18]

    param_Rand[0]   : c[0x0][0x140]
    param_Rand[1]   : c[0x0][0x144]
    param_A[0]      : c[0x0][0x148]
    param_A[1]      : c[0x0][0x14c]
    param_B[0]      : c[0x0][0x150]
    param_B[1]      : c[0x0][0x154]
    param_C[0]      : c[0x0][0x158]
    param_C[1]      : c[0x0][0x15c]
    param_lda       : c[0x0][0x160]
    param_ldb8      : c[0x0][0x164]
    param_ldc       : c[0x0][0x168]
    param_m         : c[0x0][0x16c]
    param_n         : c[0x0][0x170]
    param_k         : c[0x0][0x174]
    param_alpha     : c[0x0][0x178]
    param_beta      : c[0x0][0x17c]
    param_flags     : c[0x0][0x180]
    param_ldaz      : c[0x0][0x184]
    param_ldbz      : c[0x0][0x188]
    param_ldcz      : c[0x0][0x18c]
    param_loops     : c[0x0][0x190]
</CONSTANT_MAPPING>

// Register allocation.  Several names below share the same physical registers
// because they are used in different phases of the kernel:
//  - 0-63 appear both as czero00..czero63 (the names the setup code loads
//    zeros into) and as the 8x8 grid cx0y0..cx7y7.
//  - 64-95 are listed three times: the setup temporaries (tid, blkA, ...),
//    the j0 / j1 operand sets, and the c / d / C names.  The setup temporaries
//    are therefore not preserved once the j0 / j1 names are written, which is
//    presumably why the scalar REMAINDER path re-reads tid and blkB -- TODO confirm.
//  - 96-115 stage values loaded from global memory before they are stored to
//    shared memory.
//  - 116-121 are used as low/high pairs (see the .CC / .X adds) holding the
//    A and B global pointers.
//  - 122-127 hold the shared-memory write/read offsets, the remaining k and
//    swapBuf.
//  - The last group (tid_2, blockA, ..., tid96) is not referenced in this
//    file; presumably it belongs to the output code in the include.
// NOTE(review): ":" appears to bind names to the listed registers in order,
// while "~" appears to let the assembler choose registers from the range --
// confirm against the MaxAs documentation.
<REGISTER_MAPPING>

    64-95   ~ tid, blkA, blkB, blkZ, txb, tidAY, tidBY, tidAX, tidBX, lda, ldb, ldaz, ldbz, tid1, tid2, tid7, txa, ta, xmad_ta, tb, tid15, xmad_tb, k<1-3>, x<1-3>

    0-63    : czero<00-63>

     3, 2,11,10,19,18,27,26 : cx<0-7>y0
     7, 6,15,14,23,22,31,30 : cx<0-7>y1
     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
     5, 4,13,12,21,20,29,28 : cx<0-7>y3
    35,34,43,42,51,50,59,58 : cx<0-7>y4
    39,38,47,46,55,54,63,62 : cx<0-7>y5
    33,32,41,40,49,48,57,56 : cx<0-7>y6
    37,36,45,44,53,52,61,60 : cx<0-7>y7

    64-79   : j0Ay<0-7>, j0Bx<0-7>
    80-95   : j1Ay<0-7>, j1Bx<0-7>

    96-115  : loadAA<0-7>, loadA<0-7>, loadB<0-3>

    116-121 : track0A<0-1>, track1A<0-1>, trackB<0-1>

    122-125 ~ writeAs, writeBs, k, swapBuf
    126-127 ~ readAs, readBs

    64-75   : c<0-7>, d3, d2, d1, d0
    76-85   : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
    86-125  ~ tid_2, blockA, blockB, blockZ, ldc, ldc1, ldc4, ldc60, ldcz, writeCs, readCs, cx<00|32>, cy<00|04|08|12>, ci, xmad_c, alpha, beta, flags, tid31, tid96

</REGISTER_MAPPING>

// Fetch the thread index and the three block indices.  Each S2R names its own
// barrier (1-4) in the third control field; the first consumer of each value
// in the setup block carries the matching wait mask (01, 02, 04, 08).
// Note the axis assignment: blkA <- CTAID.Y, blkB <- CTAID.Z, blkZ <- CTAID.X.
--:-:1:-:1      S2R tid,  SR_TID.X;
--:-:2:-:1      S2R blkA, SR_CTAID.Y;
--:-:3:-:1      S2R blkB, SR_CTAID.Z;
--:-:4:-:1      S2R blkZ, SR_CTAID.X;

// One-time setup: clear the 64 czero registers, derive the per-thread tile
// coordinates, build the global pointers for A and B, compute the bounds
// predicates P4, P5 and P6, and initialise the shared-memory offsets.
// NOTE(review): instructions inside a schedule block are presumably reordered
// by the assembler, so the listed order need not be the issue order -- confirm.
<SCHEDULE_BLOCK>
--:-:-:-:1      MOV k,   param_k;
--:-:-:-:1      MOV lda, param_lda;
// ldb = param_ldb8 >> 5 (presumably param_ldb8 = ldb * 8 rows * 4 bytes -- TODO confirm)
--:-:-:-:1      MOV ldb, param_ldb8;
--:-:-:-:1      SHR.U32 ldb, ldb, 5;
--:-:-:-:1      MOV ldaz, param_ldaz;
--:-:-:-:1      MOV ldbz, param_ldbz;

// Write 16 zero bytes to addr_zero so they can be read back as a zero source.
--:-:-:-:1      STS.128 [addr_zero], RZ;

// The Perl below emits 16 LDS.U.128 instructions, one per group of four
// registers (czero00, czero04, ..., czero60), each reading from addr_zero.
<CODE>
    return join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15;
</CODE>

// tidAY = (tid & 1) << 2
// tidAX = tid >> 1
01:-:-:-:1      LOP.AND tid1,  tid,  1;
--:-:-:-:1      SHL     tidAY, tid1, 2;
01:-:-:-:1      SHR.U32 tidAX, tid,  1;

// trackA += 4 * ((blkA*128 + tidAX) * lda + tidAY + ldaz*blkZ)
// txa = blkA*128 + tidAX is the A row handled by this thread.
02:-:-:-:1      ISCADD  txa, blkA, tidAX, 7;
--:-:-:-:1      XMAD.LO  ta, lda,  txa,   tidAY, xmad_ta;
08:-:-:-:1      XMAD.LO2 ta, ldaz, blkZ,  ta;
--:-:-:-:1      LEA      track0A0.CC, ta, param_A[0],     2;
--:-:-:-:1      LEA.HI.X track0A1,    ta, param_A[1], RZ, 2;
// track1A = track0A + (lda << 8), i.e. lda * 64 * 4; this pairs with the
// txa + 64 row that is bounds-checked into P5 below.
--:-:-:-:1      LEA      track1A0.CC, lda, track0A0,      8;
--:-:-:-:1      LEA.HI.X track1A1,    lda, track0A1, RZ,  8;

// P4 = (txa < m), P5 = (txa + 64 < m).  Note that txa is modified in place.
--:-:-:-:1      ISETP.LT.AND P4, PT, txa, param_m, PT;
--:-:-:-:1      IADD txa, txa, 64;
--:-:-:-:1      ISETP.LT.AND P5, PT, txa, param_m, PT;

// tidBX = (tid & 15) << 2
// tidBY = (tid >> 4) & 7
--:-:-:-:1      LOP.AND tid15, tid,   15;
--:-:-:-:1      SHL     tidBX, tid15, 2;
--:-:-:-:1      BFE.U32 tidBY, tid,   0x304; // 3 bits at position 4

// trackB += (blkB*64 + tidBX + ldb*tidBY + ldbz*blkZ) * 4
// txb = blkB*64 + tidBX is the first of four B columns handled by this thread.
04:-:-:-:1      ISCADD   txb, blkB, tidBX, 6;
--:-:-:-:1      XMAD.LO2 tb,  ldb,  tidBY, txb;
08:-:-:-:1      XMAD.LO2 tb,  ldbz, blkZ,  tb;
--:-:-:-:1      LEA      trackB0.CC, tb, param_B[0],     0x2;
--:-:-:-:2      LEA.HI.X trackB1,    tb, param_B[1], RZ, 0x2;

// P6 = (txb < n)
--:-:-:-:1      ISETP.LT.AND P6, PT, txb, param_n, PT;

// Start the write buffers high
// writeAs = (128*tidAY + tidAX) * 4
--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 7;
--:-:-:-:1      ISCADD writeAs, writeAs, 4x<64*8 + 128*8>, 2;

// writeBs = (64*tidBY + tidBX) * 4
--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 6;
--:-:-:-:1      ISCADD writeBs, writeBs, 4x<64*8 + 128*8*2>, 2;

// Start the read buffers low
// readAs  = (((tid & 0x70) >> 3) | (tid & 1)) << 4;
--:-:-:-:1      LOP.AND readAs, tid,    0x70;
--:-:-:-:1      SHR.U32 readAs, readAs, 3;
--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
--:-:-:-:1      SHL     readAs, readAs, 4;
// readBs = ((tid >> 1) & 7) << 4 + 4x<128*8>;
--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
--:-:-:-:1      ISCADD  readBs, readBs, 4x<128*8>, 4;

// swapBuf starts as the negated offset that was added to writeAs above.
// Later code subtracts it from the read offsets, adds it to the write
// offsets and then flips its sign, so reads and writes trade halves.
--:-:-:-:1      MOV32I swapBuf, -4x<64*8 + 128*8>;
</SCHEDULE_BLOCK>

// REMAINDER: load one slice of A and B into the staging registers, store it to
// shared memory, advance the global pointers, then swap the shared-memory
// read and write offsets.  The Perl section below returns one of two
// instruction streams depending on $vec.  Only the scalar stream's insert
// table contains a branch back to this label.
REMAINDER:

<CODE>
    # $vec is a package variable that is not assigned in this file; it is
    # presumably set by the build driver to select 128-bit loads -- TODO confirm.
    our $vec;
    return $vec ? q{

// Vector path: 128-bit loads with no per-element bounds checks.
// k must be multiple of 8
--:-:2:-:1  @P6 LDG.E.CI.128 loadB0, [trackB];

// A is fetched 16 elements deep per row: loadA0-7 take offset 0 and
// loadAA0-7 take offset 8.  Only loadA0-7 are stored in this section; the
// loadAA stores are issued from the insert table when P1 is false.
--:-:3:-:1  @P4 LDG.E.CI.128 loadA0,  [track0A + 4x<0>];
--:-:3:-:1  @P4 LDG.E.CI.128 loadAA0, [track0A + 4x<8>];

--:-:4:-:1  @P5 LDG.E.CI.128 loadA4,  [track1A + 4x<0>];
--:-:4:-:1  @P5 LDG.E.CI.128 loadAA4, [track1A + 4x<8>];

// Threads whose row or column is out of range read the zeros at addr_zero
// instead of touching global memory.
--:-:-:-:1 @!P6 LDS.U.128 loadB0, [addr_zero];
--:-:-:-:1 @!P4 LDS.U.128 loadA0, [addr_zero];
--:-:6:-:1 @!P5 LDS.U.128 loadA4, [addr_zero];
--:-:-:-:1 @!P4 LDS.U.128 loadAA0, [addr_zero];
--:-:-:-:1 @!P5 LDS.U.128 loadAA4, [addr_zero];

// P1 = true.  The vector insert table inverts P1 at the top of every loop
// pass and uses it to alternate between the loadAA and loadA stores.
--:-:-:-:0      PSETP.AND.AND P1, PT, PT, PT, PT;

22:-:-:-:1      STS.128 [writeBs], loadB0;

// Advance B by param_ldb8 bytes (64-bit add through the carry flag).
--:-:-:-:6      IADD   trackB0.CC, trackB0, param_ldb8;
--:-:-:-:0      IADD.X trackB1, trackB1, RZ;

// Each of the four loaded A values goes to a shared row 128 elements apart.
04:-:-:-:1      STS [writeAs + 4x<0*128 + 00>], loadA0;
--:-:-:-:1      STS [writeAs + 4x<1*128 + 00>], loadA1;
--:-:-:-:1      STS [writeAs + 4x<2*128 + 00>], loadA2;
--:-:-:-:1      STS [writeAs + 4x<3*128 + 00>], loadA3;

// Both A pointers advance by 16 elements, matching the 16-deep fetch above.
--:-:-:-:6      IADD   track0A0.CC, track0A0, 4x<16>;
--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;

// Second A row block, stored 64 elements further along each shared row.
08:-:-:-:1      STS [writeAs + 4x<0*128 + 64>], loadA4;
--:-:-:-:1      STS [writeAs + 4x<1*128 + 64>], loadA5;
--:-:-:-:1      STS [writeAs + 4x<2*128 + 64>], loadA6;
--:-:-:-:1      STS [writeAs + 4x<3*128 + 64>], loadA7;

--:-:-:-:6      IADD   track1A0.CC, track1A0, 4x<16>;
--:-:-:-:1      IADD.X track1A1,    track1A1, RZ;

// Swap halves: reads move to the half just written, writes move to the
// other half, and swapBuf changes sign ready for the next swap.
--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;

    } : q{

// Scalar path: 32-bit loads, each guarded by its own bounds predicate.
// tid and blkB are read again here; their registers (64-95) are shared with
// the j0 / j1 names, so the earlier values are presumably gone when this
// section is re-entered from the loop -- TODO confirm.
--:-:1:-:1      S2R tid,  SR_TID.X;
--:-:2:-:1      S2R blkB, SR_CTAID.Z;
<SCHEDULE_BLOCK>
01:-:-:-:1      LOP.AND tid1,  tid,   1;
--:-:-:-:1      SHL     tidAY, tid1,  2;
--:-:-:-:1      LOP.AND tid15, tid,   15;
--:-:-:-:1      SHL     tidBX, tid15, 2;
--:-:-:-:1      BFE.U32 tidBY, tid,   0x304; // 3 bits at position 4
02:-:-:-:1      ISCADD  txb, blkB, tidBX, 6;

// doLoad0 = tidBY < k
// x1..x3 are the three B columns that follow txb.
--:-:-:-:1      IADD x1, txb, 1;
--:-:-:-:1      IADD x2, txb, 2;
--:-:-:-:1      IADD x3, txb, 3;

// P0 = (tidBY < k) and P6; P1..P3 additionally require column x1..x3 < n.
--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, k, P6;
--:-:-:-:1      ISETP.LT.AND P1, PT, x1, param_n, P0;
--:-:-:-:1      ISETP.LT.AND P2, PT, x2, param_n, P0;
--:-:-:-:1      ISETP.LT.AND P3, PT, x3, param_n, P0;

--:-:2:-:1  @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
--:-:2:-:1  @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
--:-:2:-:1  @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
--:-:2:-:1  @P3 LDG.E.CI loadB3, [trackB + 4x<3>];

// Elements that were not loaded are set to zero.
--:-:-:-:1 @!P0 MOV loadB0, RZ;
--:-:-:-:1 @!P1 MOV loadB1, RZ;
--:-:-:-:1 @!P2 MOV loadB2, RZ;
--:-:-:-:1 @!P3 MOV loadB3, RZ;

// k1..k3 are the three k indices that follow tidAY.
--:-:-:-:1      IADD k1, tidAY, 1;
--:-:-:-:1      IADD k2, tidAY, 2;
--:-:-:-:1      IADD k3, tidAY, 3;

// First A row block: P0..P3 = (tidAY + i < k) and P4.
--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P4;
--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P4;
--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P4;
--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P4;

--:-:3:-:1  @P0 LDG.E.CI loadA0, [track0A + 4x<0>];
--:-:3:-:1  @P1 LDG.E.CI loadA1, [track0A + 4x<1>];
--:-:3:-:1  @P2 LDG.E.CI loadA2, [track0A + 4x<2>];
--:-:3:-:1  @P3 LDG.E.CI loadA3, [track0A + 4x<3>];

--:-:-:-:1 @!P0 MOV loadA0, RZ;
--:-:-:-:1 @!P1 MOV loadA1, RZ;
--:-:-:-:1 @!P2 MOV loadA2, RZ;
--:-:-:-:1 @!P3 MOV loadA3, RZ;

// Second A row block: the same k tests, combined with P5 instead of P4.
--:-:-:-:1      ISETP.LT.AND P0, PT, tidAY, k, P5;
--:-:-:-:1      ISETP.LT.AND P1, PT, k1, k, P5;
--:-:-:-:1      ISETP.LT.AND P2, PT, k2, k, P5;
--:-:-:-:1      ISETP.LT.AND P3, PT, k3, k, P5;

--:-:4:-:1  @P0 LDG.E.CI loadA4, [track1A + 4x<0>];
--:-:4:-:1  @P1 LDG.E.CI loadA5, [track1A + 4x<1>];
--:-:4:-:1  @P2 LDG.E.CI loadA6, [track1A + 4x<2>];
--:-:4:-:1  @P3 LDG.E.CI loadA7, [track1A + 4x<3>];

--:-:-:-:1 @!P0 MOV loadA4, RZ;
--:-:-:-:1 @!P1 MOV loadA5, RZ;
--:-:-:-:1 @!P2 MOV loadA6, RZ;
--:-:-:-:1 @!P3 MOV loadA7, RZ;
</SCHEDULE_BLOCK>

02:-:-:-:1      STS.128 [writeBs], loadB0;

// Advance B by param_ldb8 bytes (64-bit add through the carry flag).
--:-:-:-:6      IADD   trackB0.CC, trackB0, param_ldb8;
--:-:-:-:0      IADD.X trackB1, trackB1, RZ;

// Each of the four loaded A values goes to a shared row 128 elements apart.
04:-:-:-:1      STS [writeAs + 4x<0*128 + 00>], loadA0;
--:-:-:-:1      STS [writeAs + 4x<1*128 + 00>], loadA1;
--:-:-:-:1      STS [writeAs + 4x<2*128 + 00>], loadA2;
--:-:-:-:1      STS [writeAs + 4x<3*128 + 00>], loadA3;

// The scalar path advances the A pointers by 8 elements (the vector path uses 16).
--:-:-:-:6      IADD   track0A0.CC, track0A0, 4x<8>;
--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;

08:-:-:-:1      STS [writeAs + 4x<0*128 + 64>], loadA4;
--:-:-:-:1      STS [writeAs + 4x<1*128 + 64>], loadA5;
--:-:-:-:1      STS [writeAs + 4x<2*128 + 64>], loadA6;
--:-:-:-:1      STS [writeAs + 4x<3*128 + 64>], loadA7;

--:-:-:-:6      IADD   track1A0.CC, track1A0, 4x<8>;
--:-:-:-:1      IADD.X track1A1,    track1A1, RZ;

// Swap halves: reads move to the half just written, writes move to the
// other half, and swapBuf changes sign ready for the next swap.
--:-:-:-:1      IADD readBs,  readBs, -swapBuf;
--:-:-:-:0      IADD readAs,  readAs, -swapBuf;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;

// P1 = (k > 8).  The scalar insert table branches back to REMAINDER on P1
// once the LOOP branch is no longer taken.
--:-:-:-:0      ISETP.GT.AND P1, PT, k, 8, PT;

    };
</CODE>

<CODE>
    # Builds two package variables and emits no text itself (bare return):
    #   @top    - one instruction line computing P0 = (k >= $k_end) and P6.
    #   %insert - instruction text keyed by slot names of the form jNcM.
    # NOTE(review): both are presumably consumed by the included common file,
    # with j naming the unrolled step and c the position inside that step of
    # the main loop -- confirm against sgemm_common_128x64.sass.
    # '@' is escaped in the strings so Perl does not interpolate the predicate
    # guards as arrays.
    our $vec;
    # Threshold on k used by every P0 test in this table: 16 when $vec is set, else 24.
    my $k_end = $vec ? 16 : 24;
    our @top = ("--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, P6;\n");
    our %insert =
    (
        ($vec ? 
            (
        # ---- Vector variant (128-bit loads) ----
        # P1 is inverted on every pass; it selects which half of the
        # 16-deep A fetch is stored during this pass.
        j0c1  => "--:-:-:-:1      PSETP.AND.AND P1, PT, !P1, PT, PT;\n",

        # Next B tile, guarded by the P0 computed in @top (which includes P6).
        j0c11 => "--:-:2:-:1  \@P0 LDG.E.CI.128 loadB0, [trackB];\n",

        # P0 is recomputed without P6, then k is reduced by 8.
        j0c12 => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
        j0c13 => "--:-:-:-:1      IADD32I k, k, -8;\n",

        # A is only fetched on passes where P1 is true: P2 = P0 & P1 & P4,
        # P3 = P0 & P1 & P5.
        j0c23 => "--:-:-:-:1      PSETP.AND.AND P2, PT, P0, P1, P4;\n",
        j0c24 => "--:-:-:-:1      PSETP.AND.AND P3, PT, P0, P1, P5;\n",

        j0c35 => "--:-:3:-:1  \@P2 LDG.E.CI.128 loadA0,  [track0A + 4x<0>];\n",
        j0c37 => "--:-:3:-:1  \@P2 LDG.E.CI.128 loadAA0, [track0A + 4x<8>];\n",

        j0c39 => "--:-:4:-:1  \@P3 LDG.E.CI.128 loadA4,  [track1A + 4x<0>];\n",
        j0c41 => "10:6:5:-:1  \@P3 LDG.E.CI.128 loadAA4, [track1A + 4x<8>];\n",

        # Passes with P1 false store the loadAA registers (the offset-8 half
        # of the earlier fetch).
        j2c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<0*128 + 00>], loadAA0;\n",
        j2c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128 + 00>], loadAA1;\n",
        j2c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128 + 00>], loadAA2;\n",
        j2c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128 + 00>], loadAA3;\n",

        j3c29 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<0*128 + 64>], loadAA4;\n",
        j3c31 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<1*128 + 64>], loadAA5;\n",
        j3c33 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<2*128 + 64>], loadAA6;\n",
        j3c35 => "--:-:-:-:1 \@!P1 STS [writeAs + 4x<3*128 + 64>], loadAA7;\n",

        # Passes with P1 true store the freshly fetched loadA registers and
        # advance the A pointers by 16 elements where a fetch was issued.
        j5c29 => "04:-:-:-:1  \@P1 STS [writeAs + 4x<0*128 + 00>], loadA0;\n",
        j5c31 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<1*128 + 00>], loadA1;\n",
        j5c33 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<2*128 + 00>], loadA2;\n",
        j5c35 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<3*128 + 00>], loadA3;\n",

        j5c46 => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 4x<16>;\n",
        j5c54 => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",

        j6c29 => "08:-:-:-:1  \@P1 STS [writeAs + 4x<0*128 + 64>], loadA4;\n",
        j6c31 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<1*128 + 64>], loadA5;\n",
        j6c33 => "--:-:-:-:1  \@P1 STS [writeAs + 4x<2*128 + 64>], loadA6;\n",
        j6c35 => "--:2:-:-:1  \@P1 STS [writeAs + 4x<3*128 + 64>], loadA7;\n",

        j6c46 => "20:-:-:-:1  \@P3 IADD   track1A0.CC, track1A0, 4x<16>;\n",
        j6c54 => "--:-:-:-:1  \@P3 IADD.X track1A1,    track1A1, RZ;\n",

        # Repeat the loop while P0 holds; this variant has no branch to REMAINDER.
        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
            ) :
            (
        # ---- Scalar variant (32-bit loads) ----
        # P2 / P3 = (k >= $k_end) combined with the row predicates P4 / P5.
        j0c1  => "--:-:-:-:1      ISETP.GE.AND P2, PT, k, $k_end, P4;\n",
        j0c3  => "--:-:-:-:1      ISETP.GE.AND P3, PT, k, $k_end, P5;\n",

        # Next B tile, guarded by the P0 computed in @top.
        # NOTE(review): these loads use .CS while the REMAINDER section uses
        # .CI for B -- confirm the difference is intentional.
        j0c10 => "--:-:2:-:1  \@P0 LDG.E.CS loadB0, [trackB + 4x<0>];\n",
        j0c12 => "--:-:2:-:1  \@P0 LDG.E.CS loadB1, [trackB + 4x<1>];\n",
        j0c14 => "--:-:2:-:1  \@P0 LDG.E.CS loadB2, [trackB + 4x<2>];\n",
        j0c16 => "--:-:2:-:1  \@P0 LDG.E.CS loadB3, [trackB + 4x<3>];\n",

        # P0 is recomputed without P6, then k is reduced by 8.
        j0c18 => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, $k_end, PT;\n",
        j0c20 => "--:-:-:-:1      IADD32I k, k, -8;\n",

        # Next A values for the two row blocks.
        j0c33 => "--:-:3:-:1  \@P2 LDG.E.CI loadA0, [track0A + 4x<0>];\n",
        j0c35 => "--:-:3:-:1  \@P2 LDG.E.CI loadA1, [track0A + 4x<1>];\n",
        j0c37 => "--:-:3:-:1  \@P2 LDG.E.CI loadA2, [track0A + 4x<2>];\n",
        j0c39 => "--:-:3:-:1  \@P2 LDG.E.CI loadA3, [track0A + 4x<3>];\n",

        j1c29 => "--:-:4:-:1  \@P3 LDG.E.CI loadA4, [track1A + 4x<0>];\n",
        j1c31 => "--:-:4:-:1  \@P3 LDG.E.CI loadA5, [track1A + 4x<1>];\n",
        j1c33 => "--:-:4:-:1  \@P3 LDG.E.CI loadA6, [track1A + 4x<2>];\n",
        j1c35 => "--:-:4:-:1  \@P3 LDG.E.CI loadA7, [track1A + 4x<3>];\n",

        # Store A to shared memory and advance both A pointers by 8 elements,
        # all under P0.
        j5c29 => "04:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 00>], loadA0;\n",
        j5c31 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 00>], loadA1;\n",
        j5c33 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 00>], loadA2;\n",
        j5c35 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 00>], loadA3;\n",

        j5c46 => "--:-:-:-:1  \@P0 IADD   track0A0.CC, track0A0, 4x<8>;\n",
        j5c54 => "--:-:-:-:1  \@P0 IADD.X track0A1,    track0A1, RZ;\n",

        j6c29 => "08:-:-:-:1  \@P0 STS [writeAs + 4x<0*128 + 64>], loadA4;\n",
        j6c31 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*128 + 64>], loadA5;\n",
        j6c33 => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*128 + 64>], loadA6;\n",
        j6c35 => "--:2:-:-:1  \@P0 STS [writeAs + 4x<3*128 + 64>], loadA7;\n",

        j6c46 => "--:-:-:-:1  \@P0 IADD   track1A0.CC, track1A0, 4x<8>;\n",
        j6c54 => "--:-:-:-:1  \@P0 IADD.X track1A1,    track1A1, RZ;\n",

        # Repeat the loop while P0 holds; otherwise go back to REMAINDER when
        # P1 (set there as k > 8) is true.
        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n" .
                 "--:-:-:Y:5  \@P1 BRA.U REMAINDER;\n",
            )
        ),

        # ---- Entries shared by both variants ----
        # Store the B tile and advance the B pointer by param_ldb8 bytes.
        j4c21 => "02:-:-:-:1  \@P0 STS.128 [writeBs], loadB0;\n",

        j4c22 => "--:-:-:-:1  \@P0 IADD   trackB0.CC, trackB0, param_ldb8;\n",
        j4c27 => "--:-:-:-:1  \@P0 IADD.X trackB1,    trackB1, RZ;\n",

        # Barrier, then swap the read and write halves and flip the sign of
        # swapBuf; every instruction here is guarded by P0.
        j6c63 => "02:-:-:-:5  \@P0 BAR.SYNC 0;\n" .
                 "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",


    );
    return;
</CODE>

<INCLUDE file="nervanagpu/kernels/sass/sgemm_common_128x64.sass"/>